%pip install pandas
%pip install seaborn
%pip install sklearn
%pip install plotly
%pip install scipy
%pip install nbformat
%pip install bioinfokit
Requirement already satisfied: pandas in ./venv/lib/python3.10/site-packages (1.5.1) Requirement already satisfied: numpy>=1.21.0 in ./venv/lib/python3.10/site-packages (from pandas) (1.23.4) Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.10/site-packages (from pandas) (2022.5) Requirement already satisfied: python-dateutil>=2.8.1 in ./venv/lib/python3.10/site-packages (from pandas) (2.8.2) Requirement already satisfied: six>=1.5 in ./venv/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0) Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: seaborn in ./venv/lib/python3.10/site-packages (0.12.1) Requirement already satisfied: pandas>=0.25 in ./venv/lib/python3.10/site-packages (from seaborn) (1.5.1) Requirement already satisfied: numpy>=1.17 in ./venv/lib/python3.10/site-packages (from seaborn) (1.23.4) Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in ./venv/lib/python3.10/site-packages (from seaborn) (3.6.0) Requirement already satisfied: python-dateutil>=2.7 in ./venv/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2) Requirement already satisfied: fonttools>=4.22.0 in ./venv/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.38.0) Requirement already satisfied: packaging>=20.0 in ./venv/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (21.3) Requirement already satisfied: contourpy>=1.0.1 in ./venv/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.0.5) Requirement already satisfied: pyparsing>=2.2.1 in ./venv/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9) Requirement already satisfied: pillow>=6.2.0 in ./venv/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (9.2.0) Requirement already satisfied: kiwisolver>=1.0.1 in ./venv/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4) Requirement already satisfied: 
cycler>=0.10 in ./venv/lib/python3.10/site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0) Requirement already satisfied: pytz>=2020.1 in ./venv/lib/python3.10/site-packages (from pandas>=0.25->seaborn) (2022.5) Requirement already satisfied: six>=1.5 in ./venv/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0) Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: sklearn in ./venv/lib/python3.10/site-packages (0.0) Requirement already satisfied: scikit-learn in ./venv/lib/python3.10/site-packages (from sklearn) (1.1.2) Requirement already satisfied: threadpoolctl>=2.0.0 in ./venv/lib/python3.10/site-packages (from scikit-learn->sklearn) (3.1.0) Requirement already satisfied: joblib>=1.0.0 in ./venv/lib/python3.10/site-packages (from scikit-learn->sklearn) (1.2.0) Requirement already satisfied: numpy>=1.17.3 in ./venv/lib/python3.10/site-packages (from scikit-learn->sklearn) (1.23.4) Requirement already satisfied: scipy>=1.3.2 in ./venv/lib/python3.10/site-packages (from scikit-learn->sklearn) (1.9.3) Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: plotly in ./venv/lib/python3.10/site-packages (5.10.0) Requirement already satisfied: tenacity>=6.2.0 in ./venv/lib/python3.10/site-packages (from plotly) (8.1.0) Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: scipy in ./venv/lib/python3.10/site-packages (1.9.3) Requirement already satisfied: numpy<1.26.0,>=1.18.5 in ./venv/lib/python3.10/site-packages (from scipy) (1.23.4) Note: you may need to restart the kernel to use updated packages. 
Requirement already satisfied: nbformat in ./venv/lib/python3.10/site-packages (5.7.0) Requirement already satisfied: jupyter-core in ./venv/lib/python3.10/site-packages (from nbformat) (4.11.2) Requirement already satisfied: jsonschema>=2.6 in ./venv/lib/python3.10/site-packages (from nbformat) (4.16.0) Requirement already satisfied: fastjsonschema in ./venv/lib/python3.10/site-packages (from nbformat) (2.16.2) Requirement already satisfied: traitlets>=5.1 in ./venv/lib/python3.10/site-packages (from nbformat) (5.5.0) Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in ./venv/lib/python3.10/site-packages (from jsonschema>=2.6->nbformat) (0.18.1) Requirement already satisfied: attrs>=17.4.0 in ./venv/lib/python3.10/site-packages (from jsonschema>=2.6->nbformat) (22.1.0) Note: you may need to restart the kernel to use updated packages.
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import norm, boxcox
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from bioinfokit.visuz import cluster
# Load the red-wine dataset and compute the pairwise correlation matrix of
# its columns.
wines_df = pd.read_csv('./winequality-red.csv')
corr = wines_df.corr()

plt.figure(figsize=(12, 12))

# The correlation matrix is symmetric, so hide the upper triangle (np.triu
# with its default offset also covers the diagonal) and draw only the
# lower half.
mask = np.triu(np.ones_like(corr, dtype=bool))
palette = sns.diverging_palette(20, 220, n=200)

# Colour scale pinned to [-1, 1] and centred on 0 so the diverging palette
# separates negative from positive correlations.
heatmap = sns.heatmap(
    corr,
    mask=mask,
    cmap=palette,
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    square=True,
)

# Slant the x tick labels so the long column names do not overlap.
heatmap.set_xticklabels(
    heatmap.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
)
[Text(0.5, 0, 'fixed acidity'), Text(1.5, 0, 'volatile acidity'), Text(2.5, 0, 'citric acid'), Text(3.5, 0, 'residual sugar'), Text(4.5, 0, 'chlorides'), Text(5.5, 0, 'free sulfur dioxide'), Text(6.5, 0, 'total sulfur dioxide'), Text(7.5, 0, 'density'), Text(8.5, 0, 'pH'), Text(9.5, 0, 'sulphates'), Text(10.5, 0, 'alcohol'), Text(11.5, 0, 'quality')]
# Preview the first rows to confirm the CSV parsed into the expected columns.
wines_df.head()
| | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
# (rows, columns) of the loaded dataset.
wines_df.shape
(1599, 12)
# Summary of column dtypes and non-null counts.
wines_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
# One histogram per column on a large canvas, to eyeball each distribution.
wines_df.hist(figsize=(24, 24))
plt.show()
# Skewness of each column (axis = 0), skipping missing values.
wines_df.skew(axis = 0, skipna = True)
fixed acidity 0.982751 volatile acidity 0.671593 citric acid 0.318337 residual sugar 4.540655 chlorides 5.680347 free sulfur dioxide 1.250567 total sulfur dioxide 1.515531 density 0.071288 pH 0.193683 sulphates 2.428672 alcohol 0.860829 quality 0.217802 dtype: float64
# Columns to be Box-Cox transformed, and the palette used for the plots.
skews = ['residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'sulphates']
wine_colors = ['#3B429F', '#AA7DCE', '#F5D7E3', '#F4A5AE', '#A8577E']

# Work on a copy so the transformed columns never overwrite wines_df.
df = wines_df.copy()


def _plot_with_normal_fit(values, color, title):
    """Plot a density histogram of *values* overlaid with a fitted normal pdf.

    The normal parameters come from ``norm.fit(values)``. The figure is shown
    first, then ``mu`` and ``sigma`` are printed (tab-indented) and returned
    as a ``(mu, sigma)`` tuple.
    """
    mu, sigma = norm.fit(values)
    ax = sns.histplot(
        values,
        kde=True,
        color=color,
        bins=40,  # type: ignore
        stat='density',
    )
    # Evaluate the fitted pdf across the x-range the histogram ended up with.
    grid = np.linspace(*ax.get_xlim(), 100)  # type: ignore
    ax.plot(grid, norm.pdf(grid, mu, sigma))
    plt.title(title)
    plt.show()
    print(f'\tmu: {mu}')
    print(f'\tsigma: {sigma}')
    return mu, sigma


# The plot style does not change between iterations, so set it once.
sns.set_style('darkgrid')

for i in skews:
    print(f'{i}:')

    # Distribution before the transform.
    _plot_with_normal_fit(df[i], wine_colors[0], 'Skewed')

    # boxcox returns the transformed values and the lambda it fitted; the
    # lambda is overwritten on each iteration and not used further here.
    df[i], lam = boxcox(df[i])  # type: ignore

    # Distribution after the transform.
    _plot_with_normal_fit(df[i], wine_colors[1], 'Transformed')

    print('\n#########################################################################')
residual sugar:
mu: 2.53880550343965 sigma: 1.4094871124880495
mu: 0.5135558435839319 sigma: 0.11004598444169933 ######################################################################### chlorides:
mu: 0.08746654158849279 sigma: 0.04705058260331571
mu: -4.763473789691171 sigma: 0.9624685457110036 ######################################################################### free sulfur dioxide:
mu: 15.874921826141339 sigma: 10.45688561493072
mu: 3.1290485296425143 sigma: 0.9855086968773705 ######################################################################### total sulfur dioxide:
mu: 46.46779237023139 sigma: 32.88503665178374
mu: 3.9657192644535324 sigma: 0.8459104716754929 ######################################################################### sulphates:
mu: 0.6581488430268917 sigma: 0.16945396724179546
mu: -0.6096631238347405 sigma: 0.34058620296236386 #########################################################################
# Number of wines per quality score.
wines_df['quality'].value_counts()
5 681 6 638 7 199 4 53 8 18 3 10 Name: quality, dtype: int64
# Bar chart of the number of wines per quality score, on a 5x5 figure.
sns.set(rc={'figure.figsize': (5, 5)})
sns.countplot(data=wines_df, x='quality')
<AxesSubplot: xlabel='quality', ylabel='count'>
sns.set(rc = { 'figure.figsize': (18, 24) })

# One box plot per feature (every column except the quality label), grouped
# by quality score and laid out on a three-column grid in column order.
features = [col for col in wines_df.columns if col != 'quality']
n_cols = 3
n_rows = -(-len(features) // n_cols)  # ceiling division

# squeeze=False keeps `axes` two-dimensional whatever the grid size.
fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)

for ax, feature in zip(axes.ravel(), features):
    sns.boxplot(x='quality', y=feature, data=wines_df, ax=ax)

# Drop the grid cells left empty when the feature count is not a multiple
# of n_cols.
for ax in axes.ravel()[len(features):]:
    fig.delaxes(ax)
# Feature matrix: every column except the quality label. Selecting by name
# rather than by position keeps the split correct if the column order or
# count of the CSV ever changes.
X = wines_df.drop(columns='quality')

# Quality scores as a NumPy array.
target = wines_df['quality'].to_numpy()
X.head()
| | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 |
# Rescale the feature matrix with StandardScaler; the result is a NumPy array.
scaler = StandardScaler()
X_st = scaler.fit_transform(X)
X_st
array([[-0.52835961, 0.96187667, -1.39147228, ..., 1.28864292,
-0.57920652, -0.96024611],
[-0.29854743, 1.96744245, -1.39147228, ..., -0.7199333 ,
0.1289504 , -0.58477711],
[-0.29854743, 1.29706527, -1.18607043, ..., -0.33117661,
-0.04808883, -0.58477711],
...,
[-1.1603431 , -0.09955388, -0.72391627, ..., 0.70550789,
0.54204194, 0.54162988],
[-1.39015528, 0.65462046, -0.77526673, ..., 1.6773996 ,
0.30598963, -0.20930812],
[-1.33270223, -1.21684919, 1.02199944, ..., 0.51112954,
0.01092425, 0.54162988]])
# Fit a PCA with default settings on the standardized features.
pca_out = PCA()
pca_out.fit(X_st)

# components_ holds one row per principal component, one column per feature.
loadings = pca_out.components_
loadings
array([[ 0.48931422, -0.23858436, 0.46363166, 0.14610715, 0.21224658,
-0.03615752, 0.02357485, 0.39535301, -0.43851962, 0.24292133,
-0.11323206],
[-0.11050274, 0.27493048, -0.15179136, 0.27208024, 0.14805156,
0.51356681, 0.56948696, 0.23357549, 0.00671079, -0.03755392,
-0.38618096],
[-0.12330157, -0.44996253, 0.23824707, 0.10128338, -0.09261383,
0.42879287, 0.3224145 , -0.33887135, 0.05769735, 0.27978615,
0.47167322],
[-0.22961737, 0.07895978, -0.07941826, -0.37279256, 0.66619476,
-0.04353782, -0.03457712, -0.17449976, -0.00378775, 0.55087236,
-0.12218109],
[-0.08261366, 0.21873452, -0.05857268, 0.73214429, 0.2465009 ,
-0.15915198, -0.22246456, 0.15707671, 0.26752977, 0.22596222,
0.35068141],
[ 0.10147858, 0.41144893, 0.06959338, 0.04915555, 0.30433857,
-0.01400021, 0.13630755, -0.3911523 , -0.52211645, -0.38126343,
0.36164504],
[-0.35022736, -0.5337351 , 0.10549701, 0.29066341, 0.37041337,
-0.11659611, -0.09366237, -0.17048116, -0.02513762, -0.44746911,
-0.3276509 ],
[-0.17759545, -0.07877531, -0.37751558, 0.29984469, -0.35700936,
-0.2047805 , 0.01903597, -0.23922267, -0.56139075, 0.37460432,
-0.21762556],
[-0.19402091, 0.1291103 , 0.38144967, -0.00752295, -0.11133867,
-0.63540522, 0.59211589, -0.02071868, 0.16774589, 0.05836706,
-0.03760311],
[-0.24952314, 0.36592473, 0.62167708, 0.09287208, -0.21767112,
0.24848326, -0.37075027, -0.23999012, -0.0109696 , 0.11232046,
-0.3030145 ],
[ 0.63969145, 0.0023886 , -0.0709103 , 0.18402996, 0.05306532,
-0.05142086, 0.0687016 , -0.5673319 , 0.3407109 , 0.06955538,
-0.31452591]])
# Variance explained by each principal component of the fitted PCA.
pca_out.explained_variance_
array([3.10107182, 1.92711489, 1.55151379, 1.21399175, 0.95989238,
0.66002104, 0.58415655, 0.42322138, 0.34485779, 0.18144664,
0.05959558])
# Project the standardized data with the PCA model already fitted as
# `pca_out`, instead of fitting a second, identical PCA. This guarantees the
# scores and `loadings` come from the same model.
pca_scores = pca_out.transform(X_st)
pca_scores
array([[-1.61952988, 0.45095009, -1.77445415, ..., 0.00509804,
-0.26775943, 0.04863012],
[-0.79916993, 1.85655306, -0.91169017, ..., -0.52070667,
0.06283285, -0.13814189],
[-0.74847909, 0.88203886, -1.17139423, ..., -0.08685693,
-0.18744237, -0.11822866],
...,
[-1.45612897, 0.31174559, 1.12423941, ..., -0.80877339,
0.24224843, -0.40291033],
[-2.27051793, 0.97979111, 0.62796456, ..., -0.61224806,
0.77940384, 0.04092255],
[-0.42697475, -0.53669021, 1.6289552 , ..., 0.40430898,
0.77943963, -0.44978056]])
# Share of total variance (in percent, two decimals) carried by the first
# two principal components; passed to the biplot as var1 / var2.
pc1_pct, pc2_pct = (
    round(ratio * 100, 2) for ratio in pca_out.explained_variance_ratio_[:2]
)

# Biplot of the PCA scores with the feature loadings, coloured by quality.
cluster.biplot(
    cscore=pca_scores,
    loadings=loadings,
    labels=X.columns.values,
    var1=pc1_pct,
    var2=pc2_pct,
    colorlist=target,
    show=True,
    dim=(10, 10),
    dotsize=10,
    arrowlinewidth=1,
)
findfont: Font family 'Arial' not found. findfont: Font family 'Arial' not found. findfont: Font family 'Arial' not found. findfont: Font family 'Arial' not found. findfont: Font family 'Arial' not found. findfont: Font family 'Arial' not found. findfont: Font family 'Arial' not found. findfont: Font family 'Arial' not found.
# Plotly Express functions already return a graph_objects Figure, so the
# result is used directly instead of being copied through go.Figure(...).

# Box plot of alcohol content, with the title centred.
fig = px.box(wines_df, y='alcohol', title='Box Plot Alcohol')
fig.update_layout(title_x=0.5)
fig.show('notebook')

# Histogram of pH, with the title centred.
fig = px.histogram(wines_df, x='pH', title='Histogram of pH')
fig.update_layout(title_x=0.5)
fig.show('notebook')